# Five-number summary plus mean of the `lfp_female` column of `df`.
summary(df[["lfp_female"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12.7 56.8 62.1 61.9 67.6 100.0
# Histogram of `lfp_female` (`breaks = 20` is a suggested bin count), with a
# red vertical line marking the sample mean.
hist(
  df$lfp_female,
  breaks = 20,
  main = "Female LFP in Canada",
  xlab = "Female LFP %"
)
abline(v = mean(df$lfp_female), col = "red")
# Draw six distinct region names at random from the rows whose population
# exceeds 16000, then plot the `lfp_female` column of each drawn region in its
# own figure, titled with the region name.
# NOTE(review): no seed is set in the visible code, so the regions shown will
# differ between runs unless one is set elsewhere.
random_regs <- sample(
  unique(df[df$population > 16000, ]$region_name),
  size = 6
)
for (region in random_regs) {
  region_data <- df[df$region_name == region, ]
  # Single-bracket indexing is kept on purpose: it passes a one-column table
  # (not a bare vector) to plot().
  plot(region_data["lfp_female"], main = region)
}
## [,1]
## pca1_stock 2.2266596
## avg_hh_size 2.5163871
## med_hh_income_1000 78.2328457
## avg_rooms_per_dwelling 6.1683318
## percent_hh_with_children 0.4089649
## lfp_male 70.1691613
## percent_drivers_female 0.4434255
## percent_publictransit_female 0.5712643
\[\begin{aligned} H_0 &: \text{driver}_{F} \geq \text{driver}_{M} \\ H_1 &: \text{driver}_{F} < \text{driver}_{M} \end{aligned}\]
# Paired, one-sided t-test on the `driver` columns: is the mean of the
# female-minus-male differences below zero?
# `var.equal` is not passed: t.test() ignores it when paired = TRUE, so the
# result is unchanged and the call no longer suggests a pooled-variance test.
# NOTE(review): pairing assumes both tables list the same units in the same
# row order -- confirm where the tables are built.
t.test(
  commute_modes_female$driver,
  commute_modes_male$driver,
  alternative = "less",
  paired = TRUE
)
##
## Paired t-test
##
## data: commute_modes_female$driver and commute_modes_male$driver
## t = -81.425, df = 5424, p-value < 2.2e-16
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf -150.698
## sample estimates:
## mean of the differences
## -153.8055
Reject the null hypothesis; fewer women than men commute as drivers.
\[\begin{aligned} H_0 &: \text{transit}_{F} \leq \text{transit}_{M} \\ H_1 &: \text{transit}_{F} > \text{transit}_{M} \end{aligned}\]
# Paired, one-sided t-test on the `transit` columns: is the mean of the
# female-minus-male differences above zero?
# `var.equal` is not passed: t.test() ignores it when paired = TRUE, so the
# result is unchanged and the call no longer suggests a pooled-variance test.
# NOTE(review): pairing assumes both tables list the same units in the same
# row order -- confirm where the tables are built.
t.test(
  commute_modes_female$transit,
  commute_modes_male$transit,
  alternative = "greater",
  paired = TRUE
)
##
## Paired t-test
##
## data: commute_modes_female$transit and commute_modes_male$transit
## t = 48.281, df = 5424, p-value < 2.2e-16
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 49.17677 Inf
## sample estimates:
## mean of the differences
## 50.91152
Reject null hypothesis; more women commute by public transit than men.
Paired t-tests by commute-duration bracket: in every bracket below, the difference between female and male commuters is statistically significant.
# Paired, two-sided t-test on the `t15` columns (female minus male).
# `var.equal` is not passed: t.test() ignores it when paired = TRUE, so the
# result is unchanged.
t.test(
  commute_time_female$t15,
  commute_time_male$t15,
  paired = TRUE
)
##
## Paired t-test
##
## data: commute_time_female$t15 and commute_time_male$t15
## t = 40.692, df = 5424, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 37.10181 40.85763
## sample estimates:
## mean of the differences
## 38.97972
# Paired, two-sided t-test on the `t15to29` columns (female minus male).
# `var.equal` is not passed: t.test() ignores it when paired = TRUE, so the
# result is unchanged.
t.test(
  commute_time_female$t15to29,
  commute_time_male$t15to29,
  paired = TRUE
)
##
## Paired t-test
##
## data: commute_time_female$t15to29 and commute_time_male$t15to29
## t = -20.524, df = 5424, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -23.36934 -19.29425
## sample estimates:
## mean of the differences
## -21.3318
# Paired, two-sided t-test on the `t30to44` columns (female minus male).
# `var.equal` is not passed: t.test() ignores it when paired = TRUE, so the
# result is unchanged.
t.test(
  commute_time_female$t30to44,
  commute_time_male$t30to44,
  paired = TRUE
)
##
## Paired t-test
##
## data: commute_time_female$t30to44 and commute_time_male$t30to44
## t = -56.133, df = 5424, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -51.19870 -47.74324
## sample estimates:
## mean of the differences
## -49.47097
# Paired, two-sided t-test on the `t45to59` columns (female minus male).
# `var.equal` is not passed: t.test() ignores it when paired = TRUE, so the
# result is unchanged.
t.test(
  commute_time_female$t45to59,
  commute_time_male$t45to59,
  paired = TRUE
)
##
## Paired t-test
##
## data: commute_time_female$t45to59 and commute_time_male$t45to59
## t = -30.318, df = 5424, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -19.02849 -16.71713
## sample estimates:
## mean of the differences
## -17.87281
# Paired, two-sided t-test on the `t60` columns (female minus male).
# `var.equal` is not passed: t.test() ignores it when paired = TRUE, so the
# result is unchanged.
t.test(
  commute_time_female$t60,
  commute_time_male$t60,
  paired = TRUE
)
##
## Paired t-test
##
## data: commute_time_female$t60 and commute_time_male$t60
## t = -34.61, df = 5424, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -28.42807 -25.38022
## sample estimates:
## mean of the differences
## -26.90415
# Build the modelling table: the columns named in `iv_colnames` plus the
# response `lfp_female`, both taken from `df_no_geom`.
df_vars <- df_no_geom[iv_colnames]
df_vars[["lfp_female"]] <- df_no_geom$lfp_female

# Linear model of `lfp_female` on every other column of `df_vars`.
model_all <- lm(formula = lfp_female ~ ., data = df_vars)
summary(model_all)
##
## Call:
## lm(formula = lfp_female ~ ., data = df_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.894 -2.413 0.032 2.332 46.788
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -19.534571 0.928780 -21.032 < 2e-16 ***
## pca1_stock -0.311675 0.039182 -7.955 2.17e-15 ***
## avg_hh_size -1.242855 0.301665 -4.120 3.85e-05 ***
## med_hh_income_1000 -0.014824 0.003476 -4.265 2.03e-05 ***
## avg_rooms_per_dwelling 0.313202 0.091063 3.439 0.000587 ***
## percent_hh_with_children 1.601108 1.110579 1.442 0.149448
## lfp_male 0.989699 0.008558 115.652 < 2e-16 ***
## percent_drivers_female 29.050132 1.316910 22.059 < 2e-16 ***
## percent_publictransit_female 2.632025 0.392777 6.701 2.28e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.942 on 5416 degrees of freedom
## Multiple R-squared: 0.7725, Adjusted R-squared: 0.7722
## F-statistic: 2299 on 8 and 5416 DF, p-value: < 2.2e-16
# Residuals of `model_all` plotted against observation index.
plot(
  model_all[["residuals"]],
  main = "LFP ~ . : residuals",
  ylab = "residual"
)
# Refit the full specification with `pca1_stock` removed, then compare the
# reduced and full models with anova().
model_no_sndi <- lm(
  formula = lfp_female ~ . - pca1_stock,
  data = df_vars
)
anova(model_no_sndi, model_all)
# Regress `lfp_female` on a degree-2 polynomial in `pca1_stock` alone.
# poly() is called with its defaults, i.e. orthogonal polynomial terms.
quadratic_model <- lm(
  formula = lfp_female ~ poly(pca1_stock, 2),
  data = df_vars
)
summary(quadratic_model)
##
## Call:
## lm(formula = lfp_female ~ poly(pca1_stock, 2), data = df_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -48.879 -5.088 0.157 5.696 38.325
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 61.9024 0.1120 552.916 < 2e-16 ***
## poly(pca1_stock, 2)1 0.4185 8.2461 0.051 0.96
## poly(pca1_stock, 2)2 36.8133 8.2461 4.464 8.19e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.246 on 5422 degrees of freedom
## Multiple R-squared: 0.003663, Adjusted R-squared: 0.003295
## F-statistic: 9.966 on 2 and 5422 DF, p-value: 4.782e-05
# Scatter of `lfp_female` against `pca1_stock` with a fitted degree-2
# polynomial curve and no confidence band.
# `se = FALSE` replaces `se = F`: `F` is an ordinary variable that can be
# reassigned, whereas `FALSE` is a reserved word.
ggplot(df_vars, aes(x = pca1_stock, y = lfp_female)) +
  geom_point() +
  stat_smooth(se = FALSE, method = "lm", formula = y ~ poly(x, 2)) +
  labs(
    title = "LFP ~ SNDI + SNDI^2",
    y = "Female LFP (%)",
    x = "SNDI"
  )
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 31.70 55.45 60.75 61.77 68.40 84.50
##
## Call:
## lm(formula = lfp_female ~ ., data = mtl_data_reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.9560 -2.6888 -0.1098 2.8328 14.6956
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -46.77406 4.52755 -10.331 < 2e-16 ***
## pca1_stock -0.95257 0.20592 -4.626 4.86e-06 ***
## avg_hh_size 12.59185 2.34778 5.363 1.30e-07 ***
## med_hh_income_1000 -0.04236 0.01903 -2.226 0.026529 *
## avg_rooms_per_dwelling 1.83731 0.55175 3.330 0.000939 ***
## percent_hh_with_children -53.81883 6.58213 -8.177 2.91e-15 ***
## lfp_male 1.01565 0.03206 31.679 < 2e-16 ***
## percent_drivers_female 21.55553 3.95562 5.449 8.28e-08 ***
## percent_publictransit_female 24.94393 3.44850 7.233 2.00e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.344 on 457 degrees of freedom
## Multiple R-squared: 0.7965, Adjusted R-squared: 0.7929
## F-statistic: 223.6 on 8 and 457 DF, p-value: < 2.2e-16
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 38.4 54.3 59.7 60.3 65.7 93.0
##
## Call:
## lm(formula = lfp_female ~ ., data = to_data_reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.3944 -2.3390 0.0719 2.4194 20.6767
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -22.67732 3.28606 -6.901 1.41e-11 ***
## pca1_stock -0.66500 0.14723 -4.517 7.68e-06 ***
## avg_hh_size -0.99671 0.92902 -1.073 0.283800
## med_hh_income_1000 -0.03934 0.01032 -3.812 0.000153 ***
## avg_rooms_per_dwelling 1.30731 0.30125 4.340 1.70e-05 ***
## percent_hh_with_children -10.91745 3.30729 -3.301 0.001025 **
## lfp_male 0.97139 0.03066 31.681 < 2e-16 ***
## percent_drivers_female 17.83610 3.46265 5.151 3.60e-07 ***
## percent_publictransit_female 22.22891 2.98261 7.453 3.52e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.861 on 556 degrees of freedom
## Multiple R-squared: 0.7987, Adjusted R-squared: 0.7959
## F-statistic: 275.8 on 8 and 556 DF, p-value: < 2.2e-16